Importing libraries¶
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
Functions¶
def plot_feature(df, col_name, isContinuous, target):
"""
Visualize a variable on its own and broken down by the target.
- df: dataframe
- col_name: name of the variable in the dataframe
- isContinuous: True if the variable is continuous, False otherwise
- target: name of the target variable
"""
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
count_null = df[col_name].isnull().sum()
if isContinuous:
sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
else:
sns.countplot(data=df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
ax1.set_xlabel(col_name)
ax1.set_ylabel('Count')
ax1.set_title(col_name + ' - number of nulls: ' + str(count_null))
plt.xticks(rotation = 90)
if isContinuous:
sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
ax2.set_ylabel('')
ax2.set_title(col_name + ' by '+target)
else:
data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
data.columns = [col_name, target, 'proportion']
sns.barplot(x=col_name, y='proportion', hue=target, data=data, saturation=1, ax=ax2)
ax2.set_ylabel(target+' fraction')
ax2.set_title(target)
plt.xticks(rotation = 90)
ax2.set_xlabel(col_name)
plt.tight_layout()
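A minimal usage sketch (it assumes the pd_fraud dataframe loaded below, with fraud_bool as the target):
# Distribution of a continuous variable, alone and broken down by the target
plot_feature(pd_fraud, col_name='customer_age', isContinuous=True, target='fraud_bool')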
def dame_variables_categoricas(dataset=None):
'''
----------------------------------------------------------------------------------------------------------
Function dame_variables_categoricas:
----------------------------------------------------------------------------------------------------------
-Description: receives a dataset and returns the names of its categorical variables
-Inputs:
    -- dataset: pandas dataframe containing the data
-Return:
    -- lista_variables_categoricas: names of the non-numeric variables of the input
       dataset with fewer than 100 distinct values
    -- other: names of the remaining non-numeric variables
    -- 1: the execution failed (no dataset was passed)
'''
if dataset is None:
print(u'\nMissing arguments in the function call')
return 1
lista_variables_categoricas = []
other = []
for i in dataset.columns:
if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
if unicos < 100:
lista_variables_categoricas.append(i)
else:
other.append(i)
return lista_variables_categoricas, other
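A usage sketch; note the function returns two lists, so both must be unpacked:
# Low-cardinality non-numeric columns vs. the remaining non-numeric ones
lista_categoricas, lista_otras = dame_variables_categoricas(pd_fraud)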
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
# To get the Spearman correlation, just change metodo to 'spearman'
if dataset is None:
print(u'\nThe function needs a dataset argument')
return 1
sns.set(style="white")
# Compute the correlation matrix
corr = dataset.corr(method=metodo)
# Set self-correlation to zero to avoid distraction
for i in range(corr.shape[0]):
corr.iloc[i, i] = 0
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=size_figure)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, center=0,
square=True, linewidths=.5, cmap ='viridis' ) #cbar_kws={"shrink": .5}
plt.show()
return 0
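As the inline comment says, the Spearman version only needs a different metodo; for example (lista_variables_continuas is built further down in this notebook):
get_corr_matrix(dataset=pd_fraud[lista_variables_continuas], metodo='spearman')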
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
"""
Devuelve el porcentaje de valores que exceden del intervalo de confianza
:type series:
:param multiplier:
:return:
"""
pd_final = pd.DataFrame()
for i in list_var_continuous:
series_mean = pd_loan[i].mean()
series_std = pd_loan[i].std()
std_amp = multiplier * series_std
left = series_mean - std_amp
right = series_mean + std_amp
size_s = pd_loan[i].size
perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
if perc_excess>0:
pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
.value_counts(normalize=True).reset_index()).T
pd_concat_percent.columns = [pd_concat_percent.iloc[0,0],
pd_concat_percent.iloc[0,1]]
pd_concat_percent = pd_concat_percent.drop(target, axis=0)
pd_concat_percent['variable'] = i
pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
pd_concat_percent['porcentaje_sum_outlier_values'] = perc_excess
pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
if pd_final.empty:
print('No variables with outlier values found')
return pd_final
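For reference, this function flags a value as an outlier when it falls outside the interval [mean − multiplier·std, mean + multiplier·std]; with multiplier=3 and roughly normal data, about 99.7% of the values fall inside that interval (the three-sigma rule).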
def get_percent_null_values_target(pd_loan, list_var_continuous, target):
    """
    For each continuous variable, returns the target distribution among the rows
    where the variable is missing. Missing values are encoded differently per
    variable: -1 for the month-count/session/device variables, negative values
    for 'intended_balcon_amount', and NaN otherwise.
    """
    # Variables where -1 is the missing-value sentinel
    vars_con_menos_uno = ['prev_address_months_count', 'current_address_months_count',
                          'bank_months_count', 'session_length_in_minutes',
                          'device_distinct_emails_8w']
    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if i in vars_con_menos_uno:
            mask = pd_loan[i] == -1
        elif i == 'intended_balcon_amount':
            mask = pd_loan[i] < 0
        else:
            mask = pd_loan[i].isnull()
        if mask.sum() > 0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][mask] \
                .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                         pd_concat_percent.iloc[0, 1]]
            pd_concat_percent = pd_concat_percent.drop(target, axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = mask.sum()
            pd_concat_percent['porcentaje_sum_null_values'] = mask.sum() / pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
    if pd_final.empty:
        print('No variables with null values found')
    return pd_final
def cramers_v(confusion_matrix):
"""
calculate Cramers V statistic for categorial-categorial association.
uses correction from Bergsma and Wicher,
Journal of the Korean Statistical Society 42 (2013): 323-328
confusion_matrix: tabla creada con pd.crosstab()
"""
chi2 = ss.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum()
phi2 = chi2 / n
r, k = confusion_matrix.shape
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
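For reference, the bias-corrected statistic implemented above is, with $\chi^2$ the contingency-table statistic, $n$ the sample size and $r \times k$ the table shape:

$$\tilde\varphi^2 = \max\left(0,\ \frac{\chi^2}{n} - \frac{(k-1)(r-1)}{n-1}\right), \qquad \tilde V = \sqrt{\frac{\tilde\varphi^2}{\min(\tilde k - 1,\ \tilde r - 1)}}$$

where $\tilde r = r - (r-1)^2/(n-1)$ and $\tilde k = k - (k-1)^2/(n-1)$.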
Loading the data¶
We load the preprocessed data and group the different types of variables into several lists (categorical, integer, continuous, numeric, and all variables together)
pd_fraud = pd.read_csv('./data_preprocessing/pd_data_initial_preprocessing.csv')
pd_fraud = pd_fraud.drop(columns='Unnamed: 0')
pd_fraud
| | fraud_bool | income | name_email_similarity | prev_address_months_count | current_address_months_count | customer_age | days_since_request | intended_balcon_amount | payment_type | zip_count_4w | velocity_6h | velocity_24h | velocity_4w | bank_branch_count_8w | date_of_birth_distinct_emails_4w | employment_status | credit_risk_score | email_is_free | housing_status | phone_home_valid | phone_mobile_valid | bank_months_count | has_other_cards | proposed_credit_limit | foreign_request | source | session_length_in_minutes | device_os | keep_alive_session | device_distinct_emails_8w | device_fraud_count | month |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.9 | 0.166828 | -1 | 88 | 50 | 0.020925 | -1.331345 | AA | 769 | 10650.765523 | 3134.319630 | 3863.647740 | 1 | 6 | CA | 185 | 0 | BA | 1 | 0 | 24 | 0 | 500.0 | 0 | INTERNET | 3.888115 | windows | 0 | 1 | 0 | 7 |
| 1 | 1 | 0.9 | 0.296286 | -1 | 144 | 50 | 0.005418 | -0.816224 | AB | 366 | 534.047319 | 2670.918292 | 3124.298166 | 718 | 3 | CA | 259 | 1 | BA | 0 | 0 | 15 | 0 | 1500.0 | 0 | INTERNET | 31.798819 | windows | 0 | 1 | 0 | 7 |
| 2 | 1 | 0.9 | 0.044985 | -1 | 132 | 40 | 3.108549 | -0.755728 | AC | 870 | 4048.534263 | 2893.621498 | 3159.590679 | 1 | 14 | CB | 177 | 1 | BA | 0 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 4.728705 | other | 0 | 1 | 0 | 7 |
| 3 | 1 | 0.9 | 0.159511 | -1 | 22 | 50 | 0.019079 | -1.205124 | AB | 810 | 3457.064063 | 4054.908412 | 3022.261812 | 1921 | 6 | CA | 110 | 1 | BA | 0 | 1 | 31 | 1 | 200.0 | 0 | INTERNET | 2.047904 | linux | 0 | 1 | 0 | 7 |
| 4 | 1 | 0.9 | 0.596414 | -1 | 218 | 50 | 0.004441 | -0.773276 | AB | 890 | 5020.341679 | 2728.237159 | 3087.670952 | 1990 | 2 | CA | 295 | 1 | BA | 1 | 0 | 31 | 0 | 1500.0 | 0 | INTERNET | 3.775225 | macintosh | 1 | 1 | 0 | 7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 999995 | 0 | 0.6 | 0.192631 | -1 | 104 | 40 | 0.030592 | -1.044454 | AB | 804 | 7905.711839 | 8341.468557 | 4972.635997 | 1 | 8 | CA | 75 | 1 | BC | 1 | 1 | 25 | 0 | 200.0 | 0 | INTERNET | 8.511502 | linux | 1 | 1 | 0 | 4 |
| 999996 | 0 | 0.8 | 0.322989 | 148 | 9 | 50 | 1.628119 | -1.409803 | AC | 3306 | 5391.470463 | 4955.170808 | 5022.728108 | 0 | 2 | CC | 154 | 1 | BC | 1 | 1 | -1 | 0 | 200.0 | 0 | INTERNET | 8.967865 | windows | 0 | 1 | 0 | 4 |
| 999997 | 0 | 0.8 | 0.879403 | -1 | 30 | 20 | 0.018563 | 34.692760 | AA | 1522 | 8063.102636 | 5670.654316 | 4377.196321 | 2023 | 6 | CF | 64 | 0 | BC | 0 | 1 | 11 | 0 | 200.0 | 0 | INTERNET | 8.195531 | other | 0 | 1 | 0 | 4 |
| 999998 | 0 | 0.9 | 0.762112 | -1 | 189 | 20 | 0.015352 | 94.661055 | AA | 1418 | 8092.641762 | 3982.582204 | 4394.803296 | 1678 | 6 | CA | 163 | 0 | BA | 1 | 0 | 28 | 0 | 500.0 | 0 | INTERNET | 4.336064 | windows | 1 | 1 | 0 | 4 |
| 999999 | 0 | 0.2 | 0.697452 | -1 | 321 | 20 | 2.655916 | 9.908499 | AA | 951 | 6169.630036 | 3695.308261 | 4352.334543 | 2 | 12 | CA | 36 | 1 | BE | 0 | 1 | 15 | 0 | 200.0 | 0 | INTERNET | 6.717022 | linux | 0 | 1 | 0 | 4 |
1000000 rows × 32 columns
We store the names of the categorical variables in a list
lista_variables_categoricas = [
'payment_type',
'employment_status',
'email_is_free',
'housing_status',
'phone_home_valid',
'phone_mobile_valid',
'has_other_cards',
'foreign_request',
'source',
'device_os',
'keep_alive_session',
'fraud_bool',
'month'
]
With a loop we cast these variables to the 'category' dtype
for i in lista_variables_categoricas:
pd_fraud[i] = pd_fraud[i].astype('category')
pd_fraud.dtypes
fraud_bool                          category
income                               float64
name_email_similarity                float64
prev_address_months_count              int64
current_address_months_count           int64
customer_age                           int64
days_since_request                   float64
intended_balcon_amount               float64
payment_type                        category
zip_count_4w                           int64
velocity_6h                          float64
velocity_24h                         float64
velocity_4w                          float64
bank_branch_count_8w                   int64
date_of_birth_distinct_emails_4w       int64
employment_status                   category
credit_risk_score                      int64
email_is_free                       category
housing_status                      category
phone_home_valid                    category
phone_mobile_valid                  category
bank_months_count                      int64
has_other_cards                     category
proposed_credit_limit                float64
foreign_request                     category
source                              category
session_length_in_minutes            float64
device_os                           category
keep_alive_session                  category
device_distinct_emails_8w              int64
device_fraud_count                     int64
month                               category
dtype: object
We store the names of all the numeric variables in another list
todas_las_variables = pd_fraud.columns
lista_variables_numericas = []
for variable in todas_las_variables:
if variable not in lista_variables_categoricas:
lista_variables_numericas.append(variable)
lista_variables_numericas
['income', 'name_email_similarity', 'prev_address_months_count', 'current_address_months_count', 'customer_age', 'days_since_request', 'intended_balcon_amount', 'zip_count_4w', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w', 'device_fraud_count']
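An equivalent, more compact way to build this list, relying on the 'category' casts done above:
lista_variables_numericas = pd_fraud.select_dtypes(include='number').columns.tolist()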
Stratified train/test split¶
We visualize the proportion of the target variable in the dataset
pd_fraude_bool = pd_fraud['fraud_bool'].value_counts(normalize = True).mul(100).reset_index()
pd_fraude_bool['conteo'] = pd_fraud['fraud_bool'].value_counts()
pd_fraude_bool
| | fraud_bool | proportion | conteo |
|---|---|---|---|
| 0 | 0 | 98.8971 | 988971 |
| 1 | 1 | 1.1029 | 11029 |
The target variable shows that no fraud was committed in 98.9% of cases and that fraud was committed in 1.1%
fig = px.histogram(pd_fraude_bool, x="fraud_bool", y=['proportion'], title = 'Proporción de fraudes',
labels = {"fraud_bool": "fraude"})
fig.show()
We split the data into two sets that keep a very similar proportion of the target variable
X_pd_fraud, X_pd_fraud_test, y_pd_fraud, y_pd_fraud_test = train_test_split(pd_fraud.drop('fraud_bool',axis=1),
pd_fraud['fraud_bool'],
stratify=pd_fraud['fraud_bool'],
test_size=0.2)
pd_fraud_train = pd.concat([X_pd_fraud, y_pd_fraud],axis=1)
pd_fraud_test = pd.concat([X_pd_fraud_test, y_pd_fraud_test],axis=1)
print('== Train\n', pd_fraud_train['fraud_bool'].value_counts(normalize=True))
print('== Test\n', pd_fraud_test['fraud_bool'].value_counts(normalize=True))
== Train
 fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64
== Test
 fraud_bool
0    0.98897
1    0.01103
Name: proportion, dtype: float64
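Note that the split above is not seeded, so the exact partition changes on every run. Passing random_state to train_test_split would make it reproducible; a variant (the seed value is arbitrary):
X_pd_fraud, X_pd_fraud_test, y_pd_fraud, y_pd_fraud_test = train_test_split(
    pd_fraud.drop('fraud_bool', axis=1), pd_fraud['fraud_bool'],
    stratify=pd_fraud['fraud_bool'], test_size=0.2, random_state=42)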
Descriptive visualization of the data¶
lista_nulos_dif = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count',\
'session_length_in_minutes', 'device_distinct_emails_8w', 'intended_balcon_amount']
pd_series_null_columns = pd_fraud_train\
.drop(lista_nulos_dif, axis=1)\
.isnull().sum().sort_values(ascending=False)
# ##
lista_nulos_neg_1 = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count',\
'session_length_in_minutes', 'device_distinct_emails_8w']
pd_series_null_columns2 = (pd_fraud_train[lista_nulos_neg_1] == -1)\
.sum().sort_values(ascending=False)
# ##
pd_series_null_columns3 = (pd_fraud_train['intended_balcon_amount'] < 0).sum()
pd_series_null_columns3 = pd.Series(pd_series_null_columns3, index = ['intended_balcon_amount'])
# ##
pd_null_columnas_final = pd.concat((pd_series_null_columns, pd_series_null_columns2, pd_series_null_columns3))\
.sort_values(ascending = False)
pd_null_columnas = pd.DataFrame(pd_null_columnas_final, columns=['nulos_columnas'])
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_fraud_train.shape[0]
pd_null_columnas
| | nulos_columnas | porcentaje_columnas |
|---|---|---|
| intended_balcon_amount | 594311 | 0.742889 |
| prev_address_months_count | 570079 | 0.712599 |
| bank_months_count | 203164 | 0.253955 |
| current_address_months_count | 3394 | 0.004242 |
| session_length_in_minutes | 1618 | 0.002022 |
| device_distinct_emails_8w | 293 | 0.000366 |
| name_email_similarity | 0 | 0.000000 |
| fraud_bool | 0 | 0.000000 |
| customer_age | 0 | 0.000000 |
| days_since_request | 0 | 0.000000 |
| payment_type | 0 | 0.000000 |
| zip_count_4w | 0 | 0.000000 |
| velocity_6h | 0 | 0.000000 |
| velocity_24h | 0 | 0.000000 |
| velocity_4w | 0 | 0.000000 |
| bank_branch_count_8w | 0 | 0.000000 |
| income | 0 | 0.000000 |
| employment_status | 0 | 0.000000 |
| credit_risk_score | 0 | 0.000000 |
| email_is_free | 0 | 0.000000 |
| housing_status | 0 | 0.000000 |
| phone_home_valid | 0 | 0.000000 |
| phone_mobile_valid | 0 | 0.000000 |
| has_other_cards | 0 | 0.000000 |
| proposed_credit_limit | 0 | 0.000000 |
| foreign_request | 0 | 0.000000 |
| source | 0 | 0.000000 |
| device_os | 0 | 0.000000 |
| keep_alive_session | 0 | 0.000000 |
| device_fraud_count | 0 | 0.000000 |
| month | 0 | 0.000000 |
| date_of_birth_distinct_emails_4w | 0 | 0.000000 |
We look at the number of missing values in the training-set variables. The variables 'intended_balcon_amount', 'prev_address_months_count' and 'bank_months_count' have a very large share of missing values: 74.3%, 71.3% and 25.4% of the sample, respectively
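If a single missing-value encoding were preferred, the sentinel codes could be mapped to NaN first; a sketch, not applied in this notebook:
# -1 (or any negative value, for intended_balcon_amount) encodes a missing value
for col in lista_nulos_neg_1:
    pd_fraud_train[col] = pd_fraud_train[col].replace(-1, np.nan)
pd_fraud_train.loc[pd_fraud_train['intended_balcon_amount'] < 0, 'intended_balcon_amount'] = np.nan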
pd_fraud_train['prev_address_months_count'] = pd_fraud_train['prev_address_months_count'].astype(float)
pd_fraud_train['current_address_months_count'] = pd_fraud_train['current_address_months_count'].astype(float)
pd_fraud_train['zip_count_4w'] = pd_fraud_train['zip_count_4w'].astype(float)
pd_fraud_train['bank_branch_count_8w'] = pd_fraud_train['bank_branch_count_8w'].astype(float)
pd_fraud_train['credit_risk_score'] = pd_fraud_train['credit_risk_score'].astype(float)
We cast these variables to float so the plots render better; afterwards we convert them back to int
warnings.filterwarnings('ignore')
target = 'fraud_bool'
for i in todas_las_variables:
    print(i)
if (pd_fraud_train[i].dtype==float) & (i!='fraud_bool'):
plot_feature(pd_fraud_train, col_name=i, isContinuous=True, target='fraud_bool')
elif i!='fraud_bool':
plot_feature(pd_fraud_train, col_name=i, isContinuous=False, target='fraud_bool')
[For each variable, its name is printed followed by its pair of plots (distribution and breakdown by fraud_bool); figures not shown in this export.]
I tried several configurations of the data and the plotting function, since some variables were not rendering well, and kept this configuration, which gives the best overall result.
The plots show that the proportion of fraud vs. non-fraud stays roughly constant across the different variables. The most interesting cases are 'bank_months_count', which shows a spike in the fraud proportion at the value 17, and 'customer_age', where the fraud proportion increases with age.
pd_fraud_train['prev_address_months_count'] = pd_fraud_train['prev_address_months_count'].astype(int)
pd_fraud_train['current_address_months_count'] = pd_fraud_train['current_address_months_count'].astype(int)
pd_fraud_train['zip_count_4w'] = pd_fraud_train['zip_count_4w'].astype(int)
pd_fraud_train['bank_branch_count_8w'] = pd_fraud_train['bank_branch_count_8w'].astype(int)
pd_fraud_train['credit_risk_score'] = pd_fraud_train['credit_risk_score'].astype(int)
Handling the continuous variables¶
lista_variables_continuas = []
for variable in pd_fraud.columns:
if (pd_fraud[variable].dtypes == 'float'):
lista_variables_continuas.append(variable)
lista_variables_continuas
['income', 'name_email_similarity', 'days_since_request', 'intended_balcon_amount', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'proposed_credit_limit', 'session_length_in_minutes']
Outlier treatment¶
get_deviation_of_mean_perc(pd_fraud_train, lista_variables_continuas, target='fraud_bool', multiplier=3)
| | 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_outlier_values |
|---|---|---|---|---|---|
| 0 | 0.987643 | 0.012357 | days_since_request | 14162 | 0.017702 |
| 1 | 0.990258 | 0.009742 | intended_balcon_amount | 15089 | 0.018861 |
| 2 | 0.992770 | 0.007230 | velocity_6h | 3458 | 0.004322 |
| 3 | 0.997585 | 0.002415 | velocity_24h | 414 | 0.000517 |
| 4 | 0.871431 | 0.128569 | proposed_credit_limit | 4939 | 0.006174 |
| 5 | 0.979379 | 0.020621 | session_length_in_minutes | 18816 | 0.023520 |
Among the outliers of 'proposed_credit_limit' we see the highest fraud rate, about 13%. After this analysis, and since there are not too many outliers, I decided to leave them unchanged.
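Had we chosen to treat them instead, a simple option would be capping each variable at the same mean ± 3·std bounds used by the detection function; a sketch, not applied here:
# Clip outliers to the 3-sigma interval (not applied in this notebook)
for col in lista_variables_continuas:
    mu, sigma = pd_fraud_train[col].mean(), pd_fraud_train[col].std()
    pd_fraud_train[col] = pd_fraud_train[col].clip(mu - 3 * sigma, mu + 3 * sigma)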
Correlations¶
get_corr_matrix(dataset = pd_fraud_train[lista_variables_continuas],
metodo='pearson', size_figure=[10,8])
0
corr = pd_fraud_train[lista_variables_continuas].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # keep only the strictly lower triangle, below the main diagonal
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.4]
| | level_0 | level_1 | correlation |
|---|---|---|---|
| 59 | velocity_4w | velocity_24h | 0.539547 |
| 49 | velocity_24h | velocity_6h | 0.464186 |
| 58 | velocity_4w | velocity_6h | 0.400398 |
The plot and the table clearly show that the most correlated pairs are velocity_4w with velocity_24h, velocity_24h with velocity_6h, and velocity_4w with velocity_6h.
Here I judge that the collinearity among these variables does not affect the results of the analysis, so I decide not to remove them
Missing-value treatment¶
get_percent_null_values_target(pd_fraud_train, lista_variables_continuas, target='fraud_bool')
| | 0.0 | 1.0 | variable | sum_null_values | porcentaje_sum_null_values |
|---|---|---|---|---|---|
| 0 | 0.986874 | 0.013126 | intended_balcon_amount | 594311 | 0.742889 |
| 1 | 0.993820 | 0.006180 | session_length_in_minutes | 1618 | 0.002022 |
In this case, the variable 'intended_balcon_amount' has a very large number of null values in the training dataframe, so dropping the rows where it is null is ruled out.
The algorithm to be used does not seem strongly affected by null values in this variable, so the decision here is to keep those values in the dataframe.
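If imputation were preferred instead, the KNNImputer imported at the top could be applied to the continuous columns (after converting the sentinel encodings to NaN); a minimal sketch, not applied here, and note that it is expensive on a million rows:
imputer = KNNImputer(n_neighbors=5)
pd_fraud_train[lista_variables_continuas] = imputer.fit_transform(pd_fraud_train[lista_variables_continuas])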
Handling the categorical variables¶
To measure the association between the categorical variables I use Cramér's V on contingency tables built with pd.crosstab; first I check whether any of these variables are still stored as integers
lista_variables_categoricas
['payment_type', 'employment_status', 'email_is_free', 'housing_status', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'source', 'device_os', 'keep_alive_session', 'fraud_bool', 'month']
pd_fraud_train[lista_variables_categoricas].select_dtypes(int).columns
Index([], dtype='object')
confusion_matrix = pd.crosstab(pd_fraud_train["keep_alive_session"], pd_fraud_train["fraud_bool"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
fraud_bool               0     1
keep_alive_session
0                   332634  5820
1                   458543  3003
0.050542586112449245
After trying different variables, 'keep_alive_session' shows the strongest association with the target variable
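A sketch of that sweep, computing Cramér's V of every categorical variable against the target with the functions already defined:
# Association of each categorical variable with the target
for col in lista_variables_categoricas:
    if col != 'fraud_bool':
        tabla = pd.crosstab(pd_fraud_train[col], pd_fraud_train['fraud_bool'])
        print(col, round(cramers_v(tabla.values), 4))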
Missing-value treatment¶
In this case, the categorical variables have no null values, so no action is taken.
Saving the dataframes¶
pd_fraud_train.to_csv('./data_preprocessing/train_pd_data_preprocessing_missing_outlier.csv')
pd_fraud_test.to_csv('./data_preprocessing/test_pd_data_preprocessing_missing_outlier.csv')